This document is mainly for part3 of the GEO data set analysis, that is the quality control of data sets from different tissue types.

I did PCA on control porbes, so we can see whether there are bach effects between different studies, but within same type of tissue.

1. load data and packages

library(tidyverse)
library(minfi)
library(magrittr)
library(tibble)
library(data.table)
library(limma)
library(ewastools)
library(FactoMineR)
library(doParallel)
library(ggplot2)
library(RColorBrewer)
pal <- brewer.pal(8, "Dark2")

# GEOmeta, GEO_phenotypes
load(file = "/home/qianhui/DNAme/Process_decidual/RDSfiles/GEO_phenotypes_add10.RData")

#GEO_RGset_450k_normalPlacenta, GEO_phenoData_450k_normalPlacenta, 
#GEO_RGset_EPIC_normalPlacenta, GEO_phenoData_EPIC_normalPlacenta, 
#GEO_RGset_EPIC_normalPlacenta_ourStudy, GEO_phenoData_EPIC_normalPlacenta_ourStudy
load(file ="/home/qianhui/DNAme/Process_decidual/RDSfiles/GEO_RG&pData_normalPlacenta.RData")

#GEO_RGset_450k_Amnion, GEO_phenoData_450k_Amnion, GEO_RGset_EPIC_Amnion, GEO_phenoData_EPIC_Amnion
load(file ="/home/qianhui/DNAme/Process_decidual/RDSfiles/GEO_RG&pData_normalAmnion.RData")

#GEO_RGset_450k_Chorion, GEO_phenoData_450k_Chorion,GEO_RGset_EPIC_Chorion, GEO_phenoData_EPIC_Chorion
load(file ="/home/qianhui/DNAme/Process_decidual/RDSfiles/GEO_RG&pData_normalChorion.RData")

#GEO_RGset_450k_Decidua, GEO_phenoData_450k_Decidua, GEO_RGset_EPIC_Decidua, GEO_phenoData_EPIC_Decidua, 
load(file ="/home/qianhui/DNAme/Process_decidual/RDSfiles/GEO_RG&pData_normalDecidua.RData")

#GEO_RGset_450k_MaternalBlood, GEO_phenoData_450k_MaternalBlood, 
load(file ="/home/qianhui/DNAme/Process_decidual/RDSfiles/GEO_RG&pData_normalMaternalBlood.RData")

#GEO_RGset_450k_mesenchyme, GEO_phenoData_450k_mesenchyme
load(file ="/home/qianhui/DNAme/Process_decidual/RDSfiles/GEO_RG&pData_normalPlacentalMesenchyme.RData")

#GEO_RGset_450k_trophoblast, GEO_phenoData_450k_trophoblast
load(file ="/home/qianhui/DNAme/Process_decidual/RDSfiles/GEO_RG&pData_normalPlacentalTrophoblast.RData")

#GEO_RGset_450k_UC, GEO_phenoData_450k_UC
load(file ="/home/qianhui/DNAme/Process_decidual/RDSfiles/GEO_RG&pData_normalUmbilicalCord.RData")

#GEO_RGset_450k_UCB, GEO_phenoData_450k_UCB
load(file ="/home/qianhui/DNAme/Process_decidual/RDSfiles/GEO_RG&pData_normalUmbilicalCordBlood.RData")

2. look at PCA of control probes

Function for control probe PCA, to identify potential batch effects.

# let argument RGset to be a charactor
# argument `group`, group could be `trimester`, `fetalSex` `gestationalAge` or `Study`  

ControlPCA <- function(RGsetChara, group){
  registerDoParallel(cores = 10)
  
  # get the object from character
  RGset <- get(RGsetChara)
  # get meta data
  phenoData <- pData(RGset)

  # get intensity of control probes:
  ctrls <- getProbeInfo(RGset, type = "Control")

  M.Neg <- getGreen(RGset)[ctrls$Address[ctrls$Type == "NEGATIVE"], , drop=FALSE]
  U.Neg <- getRed(RGset)[ctrls$Address[ctrls$Type == "NEGATIVE"], , drop=FALSE]
  
  # calculate beta values and M values
  beta_ctrls <- M.Neg/(M.Neg+U.Neg+100) %>% as.matrix()
  M_ctrls <- log2(beta_ctrls/(1 - beta_ctrls))
  M_ctrls_v1 <- M_ctrls[apply(M_ctrls, 1, function(x) all(is.finite(x))), , drop=FALSE]

  # remove infinite values and draw PCA plots for data sets with mutiple samples
  if(ncol(M_ctrls)==1){
    
    print("no PCA plot, since only 1 sample for this data set")
    
  }else if(ncol(M_ctrls)==2){
    
    print("no PCA plot, since only 2 samples for this data set")
    
  }else{

  ## PCA plot for control M values
  PCA_M_ctrl <- FactoMineR::PCA(base::t(M_ctrls_v1))

  ## relabel PCA_M_ctrl plot using ggplot:
  PCA_M_ctrl_plot.df <- data.frame(x = PCA_M_ctrl$ind$coord[,1], y = PCA_M_ctrl$ind$coord[,2],
                                 fetalSex=phenoData$Fetal_Sex,
                                 gestationalAge=phenoData$Gestation,
                                 trimester=phenoData$Trimester,
                                 Study= phenoData$Study)
  
  PCA_M_ctrl_plot <-
  PCA_M_ctrl_plot.df %>%
    ggplot(aes(x = x, y = y)) +
  ## add argument `group`, group could be `trimester`, `fetalSex` `gestationalAge` or `Study`  
    geom_point(aes(col = get(group)), alpha = 1, size = 7)+ #color = factor(Batch),
    scale_colour_hue()+
    labs(x="PCA1", y="PCA2", colour=group)+
    ggtitle(RGsetChara)+
    theme(text = element_text(size=20))
  
  print(PCA_M_ctrl_plot)
    
  } # end if
  
  # values to return
  ControlBMList <- list(ControlB=beta_ctrls, ControlM=M_ctrls_v1)
  return(ControlBMList)
  # saveRDS(PCA_M_ctrl, file = "/home/qianhui/DNAme/Process_decidual/RDSfiles/PCA_M_ctrl.rds")
}

Use FUN – ControlPCA for each tissue types: 450K & EPIC (data from GEO data base)

# !(GEO_phenotypes$Sample_name%in%c("GSM1617002_7668610068_R01C01","GSM1616993_7668610068_R06C02"))

GEO_RGset_450k_normalPlacenta <- GEO_RGset_450k_normalPlacenta[,-5]
GEO_RGset_450k_MaternalBlood <- GEO_RGset_450k_MaternalBlood[,-8]

TissueRGsetNames <- c(
                      "GEO_RGset_450k_normalPlacenta", "GEO_RGset_EPIC_normalPlacenta",
                      "GEO_RGset_EPIC_normalPlacenta_ourStudy",
                      "GEO_RGset_450k_Amnion", "GEO_RGset_EPIC_Amnion",
                      "GEO_RGset_450k_Chorion", "GEO_RGset_EPIC_Chorion",
                      "GEO_RGset_450k_Decidua", "GEO_RGset_EPIC_Decidua",
                      "GEO_RGset_450k_MaternalBlood", "GEO_RGset_450k_mesenchyme",
                      "GEO_RGset_450k_trophoblast", 
                      "GEO_RGset_450k_UC", "GEO_RGset_450k_UCB"
                      )


for(i in TissueRGsetNames){
    assign(paste0("ControlBM", str_replace_all(i, "GEO_RGset", "")),
           ControlPCA(RGsetChara = i, group = "Study"))
}
## Loading required package: IlluminaHumanMethylation450kmanifest

## Loading required package: IlluminaHumanMethylationEPICmanifest

## [1] "no PCA plot, since only 1 sample for this data set"
## [1] "no PCA plot, since only 2 samples for this data set"

## [1] "no PCA plot, since only 1 sample for this data set"

ControlBM_450k_normalPlacenta_plotByTrimester <- ControlPCA(RGsetChara = "GEO_RGset_450k_normalPlacenta", group = "trimester")

save(list=c(paste0("ControlBM", str_replace_all(i, "GEO_RGset", ""))),
     file = "/home/qianhui/DNAme/Process_decidual/RDSfiles/ControlBMof9TissueTypes.RData")

3. Put all control probes together

Function JoinAndPlotAllControl

ControlBMvalInfo <- tibble(
  TissueRGsetNames = c("GEO_RGset_450k_normalPlacenta", "GEO_RGset_EPIC_normalPlacenta",
                       "GEO_RGset_EPIC_normalPlacenta_ourStudy",
                       "GEO_RGset_450k_Amnion", "GEO_RGset_EPIC_Amnion",
                       "GEO_RGset_450k_Chorion", "GEO_RGset_EPIC_Chorion",
                       "GEO_RGset_450k_Decidua", "GEO_RGset_EPIC_Decidua",
                       "GEO_RGset_450k_MaternalBlood", "GEO_RGset_450k_mesenchyme",
                       "GEO_RGset_450k_trophoblast", 
                       "GEO_RGset_450k_UC", "GEO_RGset_450k_UCB"
                      ),
  ControlBMvalNames = paste0("ControlBM", str_replace_all(TissueRGsetNames, "GEO_RGset", ""))
)


# write the function to join all control informations together
# value-- Beta or M
# TissueBMvalInfo -- a tibble containing BM names

JoinAndPlotAllControl <- 
  function(ControlBMvalInfo, value, GEO_phenotypes){
  registerDoParallel(cores = 10)
  # list
  ListToJoin <- list()

  # names for the list
  listElements <- paste0(ControlBMvalInfo$ControlBMvalNames, ".df")
  # fill the list with beta/M values
  if(value=="Beta"){
    for(Name in listElements){
      TissueBMvalNames <- str_replace_all(Name, ".df", "")
      ListToJoin[[Name]] <- get(TissueBMvalNames)$ControlB %>% as.data.frame() %>% rownames_to_column(var="Probe")
    }
    
  }else if(value=="M"){
    for(Name in listElements){
      TissueBMvalNames <- str_replace_all(Name, ".df", "")
      ListToJoin[[Name]] <- get(TissueBMvalNames)$ControlM %>% as.data.frame() %>% rownames_to_column(var="Probe")
    }
  } # end if

  # left_join values form all samples
  Join.df <- ListToJoin %>% purrr::reduce(left_join, by = "Probe")
  # omit na values
  Join.df_v1 <- na.omit(Join.df)
  # as matrix
  Join.df_v2 <- Join.df_v1 %>% remove_rownames() %>% 
  column_to_rownames(var="Probe") %>% as.matrix() # 
  
  #plotMDS plots
  ## phenodata for these samples
  GEO_phenotypes_551 <- GEO_phenotypes[match(colnames(Join.df_v2),
                                                 GEO_phenotypes$Sample_name),]
  ## check names all in same order or not
  table(colnames(Join.df_v2)==GEO_phenotypes_551$Sample_name) %>% print()

  ## plot MDS plot
  par(mfrow=c(1,1))
  MDS_control <- plotMDS(Join.df_v2, top=nrow(Join.df_v2), labels= GEO_phenotypes_551$Sample, gene.selection="common") 

  # plot PCA
   PCA_control <- FactoMineR::PCA(base::t(Join.df_v2))
  
  # return Join.df_v2, GEO_phenotypes_norm552 and MDS_norm552
  output_List <- list(JoinAllControl=Join.df_v2, phenoDataAll=GEO_phenotypes_551, 
                      MDS_control=MDS_control, PCA_control=PCA_control)
  return(output_List)
}

use FUN JoinAndPlotAllControl

Control551_Beta <- JoinAndPlotAllControl(ControlBMvalInfo =  ControlBMvalInfo, 
                                         value = "Beta", 
                                         GEO_phenotypes = GEO_phenotypes)
## 
## TRUE 
##  551

Control551_M <- JoinAndPlotAllControl(ControlBMvalInfo =  ControlBMvalInfo, 
                                      value = "M", 
                                      GEO_phenotypes = GEO_phenotypes)
## 
## TRUE 
##  551

save(Control551_Beta, Control551_M,
     file = "/home/qianhui/DNAme/Process_decidual/RDSfiles/Controls_BM_all551.RData")